print(paste0("criterion = ", criterion,"   --   year = ",list_year[w], " tot_income estimate_OLS  = ",shape_OLS, " tot_income estimate MLE  = ",shape_MLE))
}
## redo it, but this time by controlling for age given adjustment factors.
PSID_data = PSID_data[which(PSID_data$age_head %in% c(19:80)),]
## redefine business income
data_factor_bus_income = factor_adj[which(list_category == "bus_income")]
PSID_data[,names(PSID_data) == paste0("business_labor_income_head_adj")] <- as.numeric(PSID_data[,names(PSID_data) == paste0("business_labor_income_head")])/as.numeric(data_factor_bus_income)
PSID_data[,names(PSID_data) == paste0("business_labor_income_wife_adj")] <- as.numeric(PSID_data[,names(PSID_data) == paste0("business_labor_income_wife")])/as.numeric(data_factor_bus_income)
PSID_data[,names(PSID_data) == paste0("business_asset_income_head_adj")] <- as.numeric(PSID_data[,names(PSID_data) == paste0("business_asset_income_head")])/as.numeric(data_factor_bus_income)
PSID_data[,names(PSID_data) == paste0("business_asset_income_wife_adj")] <- as.numeric(PSID_data[,names(PSID_data) == paste0("business_asset_income_wife")])/as.numeric(data_factor_bus_income)
sum(PSID_data$business_labor_income_head,na.rm=T)/sum(PSID_data$business_labor_income_head_adj,na.rm=T)
sum(PSID_data$business_labor_income_wife,na.rm=T)/sum(PSID_data$business_labor_income_wife_adj,na.rm=T)
###
if(type_income == "capital_income"){
PSID_data$variable     = PSID_data$tot_realized_cap_inc + (PSID_data$business_asset_income_head + PSID_data$business_asset_income_wife) + PSID_data$neg_mortgage
PSID_data$variable_adj = PSID_data$tot_realized_cap_inc + (PSID_data$business_asset_income_head_adj + PSID_data$business_asset_income_wife_adj) + PSID_data$neg_mortgage
}
if(type_income == "labor_income"){
PSID_data$variable     = PSID_data$labor_income + (PSID_data$business_labor_income_head + PSID_data$business_labor_income_wife)
PSID_data$variable_adj = PSID_data$labor_income_adj + (PSID_data$business_labor_income_head_adj + PSID_data$business_labor_income_wife_adj)
}
if(type_income == "total_income"){
PSID_data$variable     = rowSums(PSID_data[,names(PSID_data) %in% paste0(list_category)])
PSID_data$variable_adj = rowSums(PSID_data[,names(PSID_data) %in% paste0(list_category,"_adj")])
}
if(type_income == "after_tax_labor_income"){
PSID_data$variable     = PSID_data$labor_income + (PSID_data$business_labor_income_head + PSID_data$business_labor_income_wife)
family_income <- data.frame(
taxsimid  = PSID_data$taxsimid,
state     = ifelse(is.na(PSID_data$STATE_CODE),0,PSID_data$STATE_CODE),
year      = ifelse(is.na(PSID_data$year_survey_l1),0,PSID_data$year_survey_l1),
mstat     = ifelse(is.na(PSID_data$mstat),0,PSID_data$mstat),
pwages    = ifelse(is.na(PSID_data$labor_inc_head/new_factor_adj[3]),0,PSID_data$labor_inc_head/new_factor_adj[3]), # primary wages
swages    = ifelse(is.na(PSID_data$labor_inc_wife/new_factor_adj[3]),0,PSID_data$labor_inc_wife/new_factor_adj[3]),
depx      = ifelse(is.na(PSID_data$depx),0,PSID_data$depx),
psemp     = ifelse(is.na(PSID_data$business_labor_income_head_adj),0,PSID_data$business_labor_income_head_adj),
ssemp     = ifelse(is.na(PSID_data$business_labor_income_wife_adj),0,PSID_data$business_labor_income_wife_adj),
page      = ifelse(is.na(PSID_data$page),0,PSID_data$page) # primary age
)
family_taxes <- taxsim_calculate_taxes(
.data = family_income,
marginal_tax_rates = 'Wages',
return_all_information = FALSE
)
family_taxes$total_labor_taxes_adj = family_taxes$fiitax + family_taxes$siitax + family_taxes$tfica
family_taxes = family_taxes[,c("taxsimid","total_labor_taxes_adj")]
PSID_data = merge(PSID_data,family_taxes,by=c("taxsimid"))
PSID_data$variable_adj = PSID_data$labor_income_adj + (PSID_data$business_labor_income_head_adj + PSID_data$business_labor_income_wife_adj) - PSID_data$total_labor_taxes_adj
}
if(type_income == "after_tax_total_income"){
PSID_data$variable     = rowSums(PSID_data[,names(PSID_data) %in% paste0(list_category)]) - PSID_data$total_income_taxes
family_income <- data.frame(
taxsimid  = PSID_data$taxsimid,
state     = ifelse(is.na(PSID_data$STATE_CODE),0,PSID_data$STATE_CODE),
year      = ifelse(is.na(PSID_data$year_survey_l1),0,PSID_data$year_survey_l1),
mstat     = ifelse(is.na(PSID_data$mstat),0,PSID_data$mstat),
pwages    = ifelse(is.na(PSID_data$labor_inc_head/new_factor_adj[3]),0,PSID_data$labor_inc_head/new_factor_adj[3]), # primary wages
swages    = ifelse(is.na(PSID_data$labor_inc_wife/new_factor_adj[3]),0,PSID_data$labor_inc_wife/new_factor_adj[3]),
depx      = ifelse(is.na(PSID_data$depx),0,PSID_data$depx),
psemp     = ifelse(is.na(PSID_data$business_labor_income_head_adj + PSID_data$business_asset_income_head_adj),0,PSID_data$business_labor_income_head_adj + PSID_data$business_asset_income_head_adj),
ssemp     = ifelse(is.na(PSID_data$business_labor_income_wife_adj + PSID_data$business_asset_income_wife_adj),0,PSID_data$business_labor_income_wife_adj + PSID_data$business_asset_income_wife_adj),
dividends = ifelse(is.na(PSID_data$dividends/factor_adj[2]),0,PSID_data$dividends/factor_adj[2]),
intrec    = ifelse(is.na(PSID_data$intrec/factor_adj[2]),0,PSID_data$intrec/factor_adj[2]),
otherprop = ifelse(is.na(PSID_data$otherprop/factor_adj[2]),0,PSID_data$otherprop/factor_adj[2]),
pensions  = ifelse(is.na(PSID_data$pensions),0,PSID_data$pensions),
gssi      = ifelse(is.na(PSID_data$gssi/factor_adj[1]),0,PSID_data$gssi/factor_adj[1]),
pui       = ifelse(is.na(PSID_data$pui/factor_adj[1]),0,PSID_data$pui/factor_adj[1]),
sui       = ifelse(is.na(PSID_data$sui/factor_adj[1]),0,PSID_data$sui/factor_adj[1]),
trans_TAXS = ifelse(is.na(PSID_data$trans_TAXS/factor_adj[1]),0,PSID_data$trans_TAXS/factor_adj[1]),
rent      = ifelse(is.na(PSID_data$rent/factor_adj[2]),0,PSID_data$rent/factor_adj[2]),
proptax   = ifelse(is.na(PSID_data$proptax),0,PSID_data$proptax),
mortgage  = ifelse(is.na(PSID_data$mortgage),0,PSID_data$mortgage),
page      = ifelse(is.na(PSID_data$page),0,PSID_data$page) # primary age
)
family_taxes <- taxsim_calculate_taxes(
.data = family_income,
marginal_tax_rates = 'Wages',
return_all_information = FALSE
)
family_taxes$total_taxes_adj = family_taxes$fiitax + family_taxes$siitax + family_taxes$tfica
family_taxes = family_taxes[,c("taxsimid","total_taxes_adj")]
PSID_data = merge(PSID_data,family_taxes,by=c("taxsimid"))
PSID_data$variable_adj = rowSums(PSID_data[,names(PSID_data) %in% paste0(list_category,"_adj")]) - PSID_data$total_taxes_adj  ## do not adjust in this case
}
if(scale_obs == 1){
PSID_data$variable     = PSID_data$variable/PSID_data$scale
PSID_data$variable_adj     = PSID_data$variable_adj/PSID_data$scale
}
PSID_data$variable  = as.numeric(PSID_data$variable)
PSID_data$surweight = as.numeric(PSID_data$surweight)
## UNADJUSTED
if(residual_OLG == 1){
if(type_income == "capital_income"){
PSID_data = PSID_data[which(!is.na(PSID_data$variable)),]
}
mean_variable       = weighted.mean(PSID_data$variable,w = PSID_data$surweight,na.rm=T)
model_residual      = lm((variable) ~ (age_head) + I(age_head^2) + I(age_head^3) + I(age_head^4) + I(age_head^5), data = PSID_data,weights = PSID_data$surweight)
PSID_data$variable  = (model_residual$residuals) + mean_variable
}
## without adjustments:
DATA_all = as.data.frame(cbind(ifelse(is.na(PSID_data$weight_cross),0,PSID_data$weight_cross),ifelse(is.na(PSID_data$variable),0,PSID_data$variable)))
colnames(DATA_all) <- c("surweight","variable")
if(remove_zero == 1){
DATA_all = DATA_all[which(DATA_all$variable > 0),]
}
DATA_all$variable  = DATA_all$variable/weighted.mean(DATA_all$variable,w = DATA_all$surweight, na.rm=T)
## check this.
data_check <- pareto_shape_thresholds(DATA_all$variable,DATA_all$surweight,minquant,maxquant,stepquant)
data_check$criterion_MLE = data_check$maxKS_MLE #+ data_check$maxKS_OLS
data_check$criterion_OLS = data_check$maxKS_OLS #+ data_check$maxKS_MLE
data_check$criterion_NLS = data_check$maxKS_NLS #+ data_check$maxKS_MLE
plot(seq(minquant,maxquant,by=stepquant),data_check$OLS_estimate)
plot(seq(minquant,maxquant,by=stepquant),data_check$MLE_estimate)
plot(seq(minquant,maxquant,by=stepquant),data_check$NLS_estimate)
plot(seq(minquant,maxquant,by=stepquant),data_check$criterion_MLE)
plot(seq(minquant,maxquant,by=stepquant),data_check$criterion_OLS)
plot(seq(minquant,maxquant,by=stepquant),data_check$criterion_NLS)
## restrict values from eyeballing.
data_check$criterion_MLE[data_check$quantvalue > maxquant_val_nadj[w] | data_check$quantvalue < minquant_val_nadj[w]] <- 10
data_check$criterion_OLS[data_check$quantvalue > maxquant_val_nadj[w] | data_check$quantvalue < minquant_val_nadj[w]] <- 10
data_check$criterion_NLS[data_check$quantvalue > maxquant_val_nadj[w] | data_check$quantvalue < minquant_val_nadj[w]] <- 10
## select the optimal criterion
Rmin_OLS_nadj  = min(data_check$criterion_OLS)
Rmin_MLE_nadj  = min(data_check$criterion_MLE)
Rmin_NLS_nadj  = min(data_check$criterion_NLS)
xmin_OLS_nadj  = data_check$quantthreshold[data_check$criterion_OLS == min(data_check$criterion_OLS)][1]
xmin_MLE_nadj  = data_check$quantthreshold[data_check$criterion_MLE == min(data_check$criterion_MLE)][1]
xmin_NLS_nadj  = data_check$quantthreshold[data_check$criterion_NLS == min(data_check$criterion_NLS)][1]
shape_OLS_nadj = data_check$OLS_estimate[data_check$criterion_OLS   == min(data_check$criterion_OLS)][1]
shape_MLE_nadj = data_check$MLE_estimate[data_check$criterion_MLE   == min(data_check$criterion_MLE)][1]
shape_NLS_nadj = data_check$NLS_estimate[data_check$criterion_NLS   == min(data_check$criterion_NLS)][1]
quant_OLS_nadj = data_check$quantvalue[data_check$criterion_OLS     == min(data_check$criterion_OLS)][1]
quant_MLE_nadj = data_check$quantvalue[data_check$criterion_MLE     == min(data_check$criterion_MLE)][1]
quant_NLS_nadj = data_check$quantvalue[data_check$criterion_NLS     == min(data_check$criterion_NLS)][1]
## variance (2)
data2 = DATA_all[which(DATA_all$variable > reldist::wtd.quantile(DATA_all$variable, q = 0.8, na.rm=T, weight = DATA_all$surweight)),]
meanlog08_nadj = weighted.mean(x=log(data2$variable),w=data2$surweight)
varlog08_nadj  = as.numeric(weighted.var(x=log(data2$variable),w=data2$surweight))
skewlog08_nadj = w.skewness(x=log(data2$variable),mu=data2$surweight)
skew08_nadj    = w.skewness(x=(data2$variable),mu=data2$surweight)
## 90/10 ratio
top90_nadj = reldist::wtd.quantile(DATA_all$variable, q = 0.9, na.rm=T, weight = DATA_all$surweight)
top50_nadj = reldist::wtd.quantile(DATA_all$variable, q = 0.5, na.rm=T, weight = DATA_all$surweight)
ratio9050_nadj = top90_nadj/top50_nadj
## gini
gini_nadj = as.numeric(Gini_RSV(DATA_all$variable,DATA_all$surweight))
## step 3. eyeballing test
eplot_OLS_nonadj[[w]] <- eyeball_test(DATA_all$variable,DATA_all$surweight,xmin_OLS_nadj,"OLS",list_year[w])
eplot_MLE_nonadj[[w]] <- eyeball_test(DATA_all$variable,DATA_all$surweight,xmin_MLE_nadj,"MLE",list_year[w])
eplot_NLS_nonadj[[w]] <- eyeball_test(DATA_all$variable,DATA_all$surweight,xmin_NLS_nadj,"NLS",list_year[w])
data_save_check_nadj[[w]] = data_check
## print the values
print(paste0("shape_OLS_nadj == ",shape_OLS_nadj,"   shape_MLE_nadj == ",shape_MLE_nadj,"   shape_NLS_nadj == ",shape_NLS_nadj))
## ADJUSTED
PSID_data$variable_adj  = as.numeric(PSID_data$variable_adj)
PSID_data$surweight     = as.numeric(PSID_data$surweight)
if(residual_OLG == 1){
if(type_income == "capital_income"){
PSID_data = PSID_data[which(!is.na(PSID_data$variable_adj)),]
}
mean_variable_adj      = weighted.mean(PSID_data$variable_adj,w = PSID_data$surweight)
model_residual          = lm((variable_adj) ~ (age_head) + I(age_head^2) + I(age_head^3) + I(age_head^4) + I(age_head^5), data = PSID_data, weights=PSID_data$surweight)
PSID_data$variable_adj = (model_residual$residuals) + mean_variable_adj  #exp(model_residual$residuals) # + mean_variable_adj
table(PSID_data$variable_adj > 0)
}
## step 2. estimate a distribution to better account for misrepresentation.
# based on the best KS criterion.
DATA_all = as.data.frame(cbind(ifelse(is.na(PSID_data$weight_cross),0,PSID_data$weight_cross),ifelse(is.na(PSID_data$variable_adj),0,PSID_data$variable_adj)))
colnames(DATA_all) <- c("surweight","variable_adj")
if(remove_zero == 1){
DATA_all = DATA_all[which(DATA_all$variable_adj > 0),]
}
DATA_all$variable_adj  = DATA_all$variable/weighted.mean(DATA_all$variable_adj,w = DATA_all$surweight, na.rm=T)
## check this.
data_check <- pareto_shape_thresholds(DATA_all$variable_adj,DATA_all$surweight,minquant,maxquant,stepquant)
data_check$criterion_MLE = data_check$maxKS_MLE #+ data_check$maxKS_OLS
data_check$criterion_OLS = data_check$maxKS_OLS #+ data_check$maxKS_MLE
data_check$criterion_NLS = data_check$maxKS_NLS #+ data_check$maxKS_MLE
plot(seq(minquant,maxquant,by=stepquant),data_check$OLS_estimate)
plot(seq(minquant,maxquant,by=stepquant),data_check$MLE_estimate)
plot(seq(minquant,maxquant,by=stepquant),data_check$NLS_estimate)
plot(seq(minquant,maxquant,by=stepquant),data_check$criterion_MLE)
plot(seq(minquant,maxquant,by=stepquant),data_check$criterion_OLS)
plot(seq(minquant,maxquant,by=stepquant),data_check$criterion_NLS)
## restrict values from eyeballing.
data_check$criterion_MLE[data_check$quantvalue > maxquant_val_adj[w] | data_check$quantvalue < minquant_val_adj[w]] <- 10
data_check$criterion_OLS[data_check$quantvalue > maxquant_val_adj[w] | data_check$quantvalue < minquant_val_adj[w]] <- 10
data_check$criterion_NLS[data_check$quantvalue > maxquant_val_adj[w] | data_check$quantvalue < minquant_val_adj[w]] <- 10
## select the optimal criterion
Rmin_OLS  = min(data_check$criterion_OLS)
Rmin_MLE  = min(data_check$criterion_MLE)
Rmin_NLS  = min(data_check$criterion_NLS)
xmin_OLS  = data_check$quantthreshold[data_check$criterion_OLS == min(data_check$criterion_OLS)][1]
xmin_MLE  = data_check$quantthreshold[data_check$criterion_MLE == min(data_check$criterion_MLE)][1]
xmin_NLS  = data_check$quantthreshold[data_check$criterion_NLS == min(data_check$criterion_NLS)][1]
shape_OLS = data_check$OLS_estimate[data_check$criterion_OLS   == min(data_check$criterion_OLS)][1]
shape_MLE = data_check$MLE_estimate[data_check$criterion_MLE   == min(data_check$criterion_MLE)][1]
shape_NLS = data_check$NLS_estimate[data_check$criterion_NLS   == min(data_check$criterion_NLS)][1]
quant_OLS = data_check$quantvalue[data_check$criterion_OLS     == min(data_check$criterion_OLS)][1]
quant_MLE = data_check$quantvalue[data_check$criterion_MLE     == min(data_check$criterion_MLE)][1]
quant_NLS = data_check$quantvalue[data_check$criterion_NLS     == min(data_check$criterion_NLS)][1]
## variance (2)
data2     = DATA_all[which(DATA_all$variable_adj > reldist::wtd.quantile(DATA_all$variable_adj, q = 0.8, na.rm=T, weight = DATA_all$surweight)),]
meanlog08 = weighted.mean(x=log(data2$variable_adj),w=data2$surweight)
varlog08  = as.numeric(weighted.var(x=log(data2$variable_adj),w=data2$surweight))
skewlog08 = w.skewness(x=log(data2$variable_adj),mu=data2$surweight)
skew08    = w.skewness(x=(data2$variable_adj),mu=data2$surweight)
## gini
gini_adj = as.numeric(Gini_RSV(DATA_all$variable_adj,DATA_all$surweight))
## 90/10 ratio
top90 = reldist::wtd.quantile(DATA_all$variable, q = 0.9, na.rm=T, weight = DATA_all$surweight)
top50 = reldist::wtd.quantile(DATA_all$variable, q = 0.5, na.rm=T, weight = DATA_all$surweight)
ratio9050 = top90/top50
## data save
data_tmp_year = as.data.frame(cbind(list_year[w],Rmin_OLS,Rmin_MLE,xmin_OLS,xmin_MLE,shape_OLS, shape_MLE,shape_NLS,quant_OLS,quant_MLE,quant_NLS,meanlog08,
varlog08,skewlog08,skew08,gini_adj,ratio9050,Rmin_OLS_nadj,Rmin_MLE_nadj,xmin_OLS_nadj,xmin_MLE_nadj,shape_OLS_nadj, shape_MLE_nadj,shape_NLS_nadj,quant_OLS_nadj,quant_MLE_nadj,quant_NLS_nadj,meanlog08_nadj,
varlog08_nadj,skewlog08_nadj,skew08_nadj,gini_nadj,ratio9050_nadj))
data_all_year = rbind(data_all_year,data_tmp_year)
## step 3. eyeballing test
eplot_OLS[[w]] <- eyeball_test(DATA_all$variable,DATA_all$surweight,xmin_OLS,"OLS",list_year[w])
eplot_MLE[[w]] <- eyeball_test(DATA_all$variable,DATA_all$surweight,xmin_MLE,"MLE",list_year[w])
eplot_NLS[[w]] <- eyeball_test(DATA_all$variable_adj,DATA_all$surweight,xmin_NLS,"NLS",list_year[w])
## reconstruct the initial distribution.
data_check$year               = list_year[w]
DATA_all$year                 = list_year[w]
data_check$OLS_best_estimate  = shape_OLS
data_check$MLE_best_estimate  = shape_MLE
data_save_check[[w]]          = data_check
data_wealth_adjusted[[w]]     = DATA_all
data_wealth_non_adjusted[[w]] = PSID_data
MLE_intercept[w]              = data_check$MLE_estimate[data_check$criterion_MLE == min(data_check$criterion_MLE)][1]
OLS_intercept[w]              = data_check$OLS_estimate[data_check$criterion_OLS == min(data_check$criterion_OLS)][1]
NLS_intercept[w]              = data_check$NLS_estimate[data_check$criterion_NLS == min(data_check$criterion_NLS)][1]
## final results:
print(paste0("shape_OLS_adj == ",shape_OLS,"   shape_MLE_adj == ",shape_MLE,"   shape_NLS_adj == ",shape_NLS))
## print the values
print(paste0("Country == USA    year == ",list_year[w],"  p99 = ",round(compute_share(0.99,DATA_all$variable_adj,DATA_all$surweight),3),
"  p99_surv = ",round(compute_share(0.99,PSID_data$variable,PSID_data$weight_cross),3)))
}
ggarrange(plotlist = eplot_OLS)
ggarrange(plotlist = eplot_MLE)
ggarrange(plotlist = eplot_NLS)
#ggarrange(plotlist = eplot_OLS_nonadj)
#ggarrange(plotlist = eplot_MLE_nonadj)
apply(data_all_year[,names(data_all_year) %in% c("gini_adj","quant_OLS","shape_OLS","quant_MLE","shape_MLE","quant_NLS","shape_NLS","varlog08","meanlog08","skewlog08","ratio9050")],2,median)
apply(data_all_year[,names(data_all_year) %in% c("gini_adj","quant_OLS","shape_OLS","quant_MLE","shape_MLE","quant_NLS","shape_NLS","varlog08","meanlog08","skewlog08","ratio9050")],2,mean)
apply(data_all_year[,names(data_all_year) %in% c("gini_adj","quant_OLS","shape_OLS","quant_MLE","shape_MLE","quant_NLS","shape_NLS","varlog08","meanlog08","skewlog08","ratio9050")],2,sd)
apply(data_all_year[,names(data_all_year) %in% c("gini_nadj","quant_OLS_nadj","shape_OLS_nadj","quant_MLE_nadj","shape_MLE_nadj","quant_NLS_nadj","shape_NLS_nadj","varlog08_nadj","meanlog08_nadj","skewlog08_nadj","ratio9050_nadj")],2,median)
apply(data_all_year[,names(data_all_year) %in% c("gini_nadj","quant_OLS_nadj","shape_OLS_nadj","quant_MLE_nadj","shape_MLE_nadj","quant_NLS_nadj","shape_NLS_nadj","varlog08_nadj","meanlog08_nadj","skewlog08_nadj","ratio9050_nadj")],2,mean)
apply(data_all_year[,names(data_all_year) %in% c("gini_nadj","quant_OLS_nadj","shape_OLS_nadj","quant_MLE_nadj","shape_MLE_nadj","quant_NLS_nadj","shape_NLS_nadj","varlog08_nadj","meanlog08_nadj","skewlog08_nadj","ratio9050_nadj")],2,sd)
## full figure
data_tmp           = rbindlist(data_save_check)
data_tmp           = data_tmp[,c("MLE_estimate","OLS_estimate","year","quantvalue","OLS_best_estimate","MLE_best_estimate")]
data_tmp$year_OLS  = paste0(data_tmp$year," (",round(data_tmp$OLS_best_estimate,2),")")
data_tmp$year_MLE  = paste0(data_tmp$year," (",round(data_tmp$MLE_best_estimate,2),")")
merged_mean        = aggregate(cbind(OLS_estimate,MLE_estimate,OLS_best_estimate,MLE_best_estimate) ~ quantvalue,data=data_tmp, FUN = "mean")
merged_mean$year   = "average"
merged_mean$year_OLS = paste0(merged_mean$year," (",round(merged_mean$OLS_best_estimate,2),")")
merged_mean$year_MLE = paste0(merged_mean$year," (",round(merged_mean$MLE_best_estimate,2),")")
MLE_mean_intercept = mean(data_tmp$MLE_best_estimate)
OLS_mean_intercept = mean(data_tmp$OLS_best_estimate)
if(plot_mean_value == 1){
data_tmp           = rbind(data_tmp,merged_mean)
}
data_abline        = aggregate(cbind(OLS_estimate,MLE_estimate,OLS_best_estimate,MLE_best_estimate) ~ year + year_OLS + year_MLE,data=data_tmp, FUN = "mean")
## define options for figures
selected_year = c(2004,2008,2012,2016,2020,"average")
data_tmp$alpha_opt = 0.0
data_tmp$alpha_opt[which(data_tmp$year %in% selected_year)] <- 0.5
data_abline$alpha_opt = 0.0
data_abline$alpha_opt[which(data_abline$year %in% selected_year)] = 0.5
data_tmp    = data_tmp[which(data_tmp$year %in% selected_year),]
data_abline = data_abline[which(data_abline$year %in% selected_year),]
## color palette
color_plot <- c(brewer.pal(n = 8, name = "Dark2"),"#146C94")
color_plot <- c("#A6761D","#146C94","#7570B3","#1B9E77","black",brewer.pal(n = 8, name = "Dark2"))
#color_plot <- c("#A6761D","#146C94","#7570B3","black",brewer.pal(n = 8, name = "Dark2"))
color_plot    <- c(brewer.pal(n = 5, name = "Dark2"),"black")#"#146C94")
linetype_plot <- c(1,1,1,1,1,1)
alpha_plot    <- c(0.4,0.4,0.4,0.4,0.4,2)
size_plot     <- c(1,1,1,1,1,1.25)
## summary plot
pplotMLE_conso <- ggplot(data_tmp,aes(x=quantvalue, y = MLE_estimate, color=factor(year_MLE), size=factor(year_MLE),  alpha=factor(year_MLE), linetype=factor(year_MLE))) + geom_point(pch=1) + geom_line() + th +
scale_color_manual(values = color_plot) +
scale_linetype_manual(values = linetype_plot) +
scale_alpha_manual(values = alpha_plot) +
scale_size_manual(values = size_plot) +
geom_abline(intercept = as.numeric(MLE_mean_intercept), slope =0, linetype=2, size = 1.25, color="black") +
geom_abline(
mapping = aes(intercept = as.numeric(MLE_best_estimate), slope = 0,  color=factor(year_MLE), size=factor(year_MLE),  alpha=factor(year_MLE)),
data = data_abline, linetype=2,  alpha=data_abline$alpha_opt
)  + ggtitle(paste0(title_plot,", MLE")) +
xlim(minquant_val,1.0) + ylim(range_low,range_high) +
ylab(TeX("Pareto tail ($\\eta^y$)")) + xlab(expression(paste(underline(paste("x"))," (as a quantile of X)")))
pplotMLE_conso
## summary plot
pplotOLS_conso <- ggplot(data_tmp,aes(x=quantvalue, y = OLS_estimate, color=factor(year_OLS), size=factor(year_OLS),  alpha=factor(year_OLS), linetype=factor(year_OLS))) + geom_point(pch=1) + geom_line() + th +
scale_color_manual(values = color_plot) +
scale_linetype_manual(values = linetype_plot) +
scale_alpha_manual(values = alpha_plot) +
scale_size_manual(values = size_plot) +
geom_abline(intercept = as.numeric(OLS_mean_intercept), slope =0, linetype=2, size = 1.25, color="black") +
geom_abline(
mapping = aes(intercept = as.numeric(OLS_best_estimate), slope = 0,  color=factor(year_OLS), size=factor(year_OLS),  alpha=factor(year_OLS)),
data = data_abline, linetype=2,  alpha=data_abline$alpha_opt
) + ggtitle(paste0(title_plot,", OLS")) +
xlim(minquant_val,1.0) + ylim(range_low,range_high) +
ylab(TeX("Pareto tail ($\\eta^y$)")) + xlab(expression(paste(underline(paste("x"))," (as a quantile of X)")))
#pplotOLS_conso
g_adj <- ggarrange(pplotOLS_conso,pplotMLE_conso,common.legend=TRUE)
g_adj
ggsave(paste0(path,"code/top_tails/PSID/output_PSID/",type_income,"_final_scale=",scale_obs,"_residual_OLG=",residual_OLG,".pdf"), width = 7.2, height = 3.39)
if(type_income == "capital_income"){
data_all_year_cap_inc = data_all_year
data_save_check_cap_inc = data_save_check
data_save_check_cap_inc_nadj = data_save_check_nadj
save(data_all_year_cap_inc,file=paste0(path,"code/top_tails/PSID/output_PSID/data_all_year_capinc_scale=",scale_obs,"_removezero=",remove_zero,"_residual_OLG=",residual_OLG,".RData"))
save(data_save_check_cap_inc,file=paste0(path,"code/top_tails/PSID/output_PSID/data_save_check_capinc_scale=",scale_obs,"_removezero=",remove_zero,"_residual_OLG=",residual_OLG,".RData"))
save(data_save_check_cap_inc_nadj,file=paste0(path,"code/top_tails/PSID/output_PSID/data_save_check_capinc_nadj_scale=",scale_obs,"_removezero=",remove_zero,"_residual_OLG=",residual_OLG,".RData"))
}
if(type_income == "labor_income"){
data_all_year_lab_inc = data_all_year
data_save_check_lab_inc = data_save_check
data_save_check_lab_inc_nadj = data_save_check_nadj
save(data_all_year_lab_inc,file=paste0(path,"code/top_tails/PSID/output_PSID/data_all_year_labinc_scale=",scale_obs,"_removezero=",remove_zero,"_residual_OLG=",residual_OLG,".RData"))
save(data_save_check_lab_inc,file=paste0(path,"code/top_tails/PSID/output_PSID/data_save_check_labinc_scale=",scale_obs,"_removezero=",remove_zero,"_residual_OLG=",residual_OLG,".RData"))
save(data_save_check_lab_inc_nadj,file=paste0(path,"code/top_tails/PSID/output_PSID/data_save_check_labinc_nadj_scale=",scale_obs,"_removezero=",remove_zero,"_residual_OLG=",residual_OLG,".RData"))
}
# ----------------------------------------------- #
# Script OUTPUT model
#                    BY ALEXANDRE GAILLARD        #
# ----------------------------------------------- #
# LIBRARY #
library(foreign)
library(readstata13)
library(tidyr)
library(data.table)
library(questionr)
library(plotrix)
library(spatstat)
library(plotly)
library(ggpubr)
library(xtable)
library(latex2exp)
setwd("/Users/alexandregaillard/Dropbox (Personal)/JOLE_revision/Code/SEA_paper/")
## Import theme for ggplot2
th <- theme(
panel.background = element_rect(fill = "white",
colour = "black",
size = 0.7, linetype = "solid"),
panel.border = element_rect(color="black",fill = "transparent",  size = 0.7),
panel.grid.major = element_line(color="#e9e9e9",  size = 0.3, linetype = 2),
panel.grid.minor = element_line(color="#f5f5f5", size = 0.3, linetype = 2),
legend.title=element_blank(),
legend.text = element_text(size = 12,family="Palatino",color="black"),
legend.key.width = unit(1,"cm"),
legend.key.height= unit(0.4,"cm"),
legend.key = element_rect(fill = "transparent",colour="transparent"),
legend.margin = margin(3, 6, 3, 3),
legend.box.margin=margin(1,1,1,1),
legend.spacing.x = unit(1, "mm"),
legend.spacing.y = unit(0, "mm"),
legend.justification = c("center", "top"),
legend.background = element_rect(fill = "white",colour="black",size=0.3,linetype = "solid"),
plot.margin = margin(0.5,0.5,0.1,0.1, "cm"),
axis.title = element_text(size=12,family="Palatino",color="black"),
axis.text = element_text(size=12,family="Palatino",color="black"),
axis.line = element_line(color="black", size = 0.0, linetype = 1),
plot.title = element_text(size=12,family="Palatino",color="black"))
##### --  Figure 3 (Average survival rate) #####
## for the average survival rate, it is just workers 0.8 + 0.2*unemployed one. at the beginning.
data_survival = read.table("OUTPUT/survival_10y_POLICY=0_OPT_labor_demand=1_Teq=1_Weq=1_Req=1_type=0_rhostar=0.400000_pLTpar=0.500000.txt",sep=",")
data_WW     = as.data.frame(cbind(rep("Worker",21),unlist((data_survival[1,])),seq(1,21)))
data_WW$V2  = as.numeric(data_WW$V2)
data_WW$V2[1] = 1
data_U      = as.data.frame(cbind(rep("Unemployed",21),unlist((data_survival[2,])),seq(1,21)))
data_U$V2   = as.numeric(data_U$V2)
data_U$V2[1] = 1
data_NEC    = as.data.frame(cbind(rep("Necessity share",21),unlist((data_survival[3,])),seq(1,21)))
data_NEC$V2 = as.numeric(data_NEC$V2)
data_NEC$V2[1] = 1
ggplot_data = rbind(data_WW,data_U,data_NEC)
p_fig3_survival <- ggplot(ggplot_data,aes(y=as.numeric(V2),x=as.numeric(V3),col=V1,linetype=V1)) + geom_line(size=1) + # geom_point(size=1.2) +
scale_color_manual(values=c("#3468C0","purple","black")) + th + ylim(0,1) + ylab("survival rate") + xlab("quarters") +
theme(legend.position = c(0.8, 0.9))
p_fig3_survival
data_size = read.table("OUTPUT/size_10y_POLICY=0_OPT_labor_demand=1_Teq=1_Weq=1_Req=1_type=0_rhostar=0.400000_pLTpar=0.500000.txt",sep=",")
data_WW     = as.data.frame(cbind(rep("Worker",21),unlist((data_size[1,])),seq(1,21)))
data_WW$V2  = as.numeric(data_WW$V2)
data_U      = as.data.frame(cbind(rep("Unemployed",21),unlist((data_size[2,])),seq(1,21)))
data_U$V2   = as.numeric(data_U$V2)
data_NEC    = as.data.frame(cbind(rep("Necessity share",21),unlist((data_size[3,])),seq(1,21)))
data_NEC$V2 = as.numeric(data_NEC$V2)
ggplot_data = rbind(data_WW,data_U,data_NEC)
p_fig3_size <- ggplot(ggplot_data,aes(y=as.numeric(V2),x=as.numeric(V3),col=V1,linetype=V1)) + geom_line(size=1) +  #geom_point(size=1.2) +
scale_color_manual(values=c("#3468C0","purple","black")) + th + ylab("average size") + xlab("quarters") +
theme(legend.position = c(0.8, 0.9))
p_fig3_size
p_fig3 <- ggarrange(p_fig3_survival,p_fig3_size,common.legend = TRUE)
p_fig3
ggsave("figure_baseline.pdf",p_fig3,path="/Users/alexandregaillard/Dropbox (Personal)/JOLE_revision/figure_paper",width=8.0,height=3.4)
#### Table 3. ####
data_benchmark   = read.table("OUTPUT/SS_statistics_POLICY=0_OPT_labor_demand=1_Teq=1_Weq=1_Req=1_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_SEA         = read.table("OUTPUT/SS_statistics_POLICY=1_OPT_labor_demand=1_Teq=1_Weq=1_Req=1_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_SEANB       = read.table("OUTPUT/SS_statistics_POLICY=3_OPT_labor_demand=1_Teq=1_Weq=1_Req=1_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_SEALS       = read.table("OUTPUT/SS_statistics_POLICY=2_OPT_labor_demand=1_Teq=1_Weq=1_Req=1_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_benchmark$C_welfaretot
data_benchmark$welfare
calculate_new_creation <- function(data, data_benchmark) {
return(500000 * (data$FracUtoE * data$Urate / 100 + data$FracWtoE * data$Corp_jobs) /
(data_benchmark$FracUtoE * data_benchmark$Urate / 100 + data_benchmark$FracWtoE * data_benchmark$Corp_jobs))
}
calculate_CEV <- function(data, data_benchmark) {
return(((data$welfare/data_benchmark$welfare))^(1/(1-1.5))-1)
}
prepare_table_data <- function(data, data_benchmark) {
new_creation <- calculate_new_creation(data, data_benchmark)
new_CEV <- calculate_CEV(data, data_benchmark)
return(c(data$fracE * 100,
data$Urate, data$FracUtoE * 100,
new_creation,  data$ExitE * 100,
data$Bankruptcy,  data$Necessity_share * 100,
data$total_labor, (data$total_labor-data$E_jobs)/data$total_labor*100,
(data$K_Corp+data$K_E), data$K_Corp/(data$K_Corp+data$K_E) * 100,
data$Y, data$fracEi/data$fracE*100,  data$tax_rate * 100,
data$cost_over_gdp*100, new_CEV*100))
}
table3_benchmark <- prepare_table_data(data_benchmark, data_benchmark)
table3_SEA <- prepare_table_data(data_SEA, data_benchmark)
table3_SEANB <- prepare_table_data(data_SEANB, data_benchmark)
table3_SEALS <- prepare_table_data(data_SEALS, data_benchmark)
# Function to calculate deviation, with percent deviation for the last 3 entries
calculate_deviation <- function(table, benchmark) {
percent_deviation = table
percent_deviation[c(1:12)] <- (table[c(1:12)] - benchmark[c(1:12)])/benchmark[c(1:12)] * 100
percent_deviation[c(13,14,15,16)] <- (table[c(13,14,15,16)])
return(c(percent_deviation))
}
# Assuming table3_benchmark, table3_SEA, table3_SEANB, and table3_SEALS are already defined
deviation_SEA <- calculate_deviation(table3_SEA, table3_benchmark)
deviation_SEANB <- calculate_deviation(table3_SEANB, table3_benchmark)
deviation_SEALS <- calculate_deviation(table3_SEALS, table3_benchmark)
# Combine deviations into a single table
deviation_table <- data.frame(
SEA = deviation_SEA,
SEANB = deviation_SEANB,
SEALS = deviation_SEALS
)
deviation_table = round(cbind(table3_benchmark,deviation_table),3)
rownames(deviation_table) <- c("Fraction of self-employed (%)",
"Unemployment rate (%)","Fraction unemployed starting businesses (%)",
"New firms per year","Self-employment exit rate (%)",
"Bankruptcy rate per quarter (%)","Necessity share (%)",
"Total labor","Share L in corporate sector",
"Total capital","Share K in corporate sector",
"Total production","Fraction of insured self-employed (%)","Tax rate (%)","Cost of the policy/gdp (%)",
"Consumption Equivalent Variation (CEV) (%)")
xtable(deviation_table[,c("table3_benchmark","SEA","SEALS","SEANB")],digits=2)
xtable(deviation_table[,c("table3_benchmark","SEA","SEALS","SEANB")],digits=3)
## becareful the overall welfare is reported in the file of the VEILESS of the transition.
#### Table General Equilibrium ####
data_SEA_bench               = read.table("OUTPUT/SS_statistics_POLICY=1_OPT_labor_demand=1_Teq=1_Weq=0_Req=0_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_SEA_no_change         = read.table("OUTPUT/SS_statistics_POLICY=1_OPT_labor_demand=1_Teq=0_Weq=0_Req=0_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_SEA_no_t_change         = read.table("OUTPUT/SS_statistics_POLICY=1_OPT_labor_demand=1_Teq=1_Weq=0_Req=0_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_SEA_no_w_change         = read.table("OUTPUT/SS_statistics_POLICY=1_OPT_labor_demand=1_Teq=0_Weq=1_Req=0_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
data_SEA_no_r_change         = read.table("OUTPUT/SS_statistics_POLICY=1_OPT_labor_demand=1_Teq=0_Weq=0_Req=1_rhostar=0.400000_pLTpar=0.500000.txt",sep="\t",header=T)
prepare_table_data <- function(data, data_benchmark) {
new_creation <- calculate_new_creation(data, data_benchmark)
new_CEV <- calculate_CEV(data, data_benchmark)
return(c(data$fracE * 100,
data$Urate,
data$FracUtoE * 100,
new_creation,
data$ExitE * 100,
data$total_labor,
(data$K_Corp+data$K_E)))
}
table3_benchmark <- prepare_table_data(data_benchmark, data_benchmark)
table3_SEA <- prepare_table_data(data_SEA, data_benchmark)
table3_SEA_no <- prepare_table_data(data_SEA_no_change, data_benchmark)
table3_SEA_no_t <- prepare_table_data(data_SEA_no_t_change, data_benchmark)
table3_SEA_no_w <- prepare_table_data(data_SEA_no_w_change, data_benchmark)
table3_SEA_no_r <- prepare_table_data(data_SEA_no_r_change, data_benchmark)
# Function to calculate deviation, with percent deviation for the last 3 entries
calculate_deviation <- function(table, benchmark) {
percent_deviation = (table-benchmark)/benchmark*100
return(c(percent_deviation))
}
# Assuming table3_benchmark, table3_SEA, table3_SEANB, and table3_SEALS are already defined
deviation_SEA <- calculate_deviation(table3_SEA, table3_benchmark)
deviation_SEA_no <- calculate_deviation(table3_SEA_no, table3_benchmark)
deviation_SEA_no_t <- calculate_deviation(table3_SEA_no_t, table3_benchmark)
deviation_SEA_no_w <- calculate_deviation(table3_SEA_no_w, table3_benchmark)
deviation_SEA_no_r <- calculate_deviation(table3_SEA_no_r, table3_benchmark)
# Combine deviations into a single table
deviation_table <- data.frame(
SEA = deviation_SEA,
SEA_no_adjust = deviation_SEA_no,
SEA_t_adjust = deviation_SEA_no_t,
SEA_w_adjust = deviation_SEA_no_w,
SEA_r_adjust = deviation_SEA_no_r
)
deviation_table = round(cbind(deviation_table),3)
rownames(deviation_table) <- c("Fraction of self-employed (%)",
"Unemployment rate (%)",
"Fraction unemployed starting businesses (%)",
"New firms per year",
"Self-employment exit rate (%)",
"Total labor",
"Total capital")
xtable(deviation_table[,c("SEA","SEA_no_adjust","SEA_t_adjust","SEA_w_adjust", "SEA_r_adjust")],digits=2)
## becareful the overall welfare is reported in the f
